#!/usr/local/bin/perl
# gla, boone, 06/18/92
# Gopher log analyzer
# Copyright (C) 1992, Michigan State University Board of Trustees
#
# Version 1.1
#
# Mail to: Dennis Boone <drbmaint@msu.edu>
#
# Modifications:
#   06/18/92  Boone  Initial coding
#   11/13/92  Boone  Added percentages
# End Modifications
#
# Description:
#   gla reads a gopher log file and extracts statistics: times a file was
#   referenced and times any given domain came calling.  Host names and
#   IP addresses are relieved of the most specific portion to ensure privacy.
#   There are five sections in the gla report: Hostnames in alpha order,
#   hostnames in order of frequency, filenames in alpha order, filenames in
#   order of frequency, and overall statistics.  Each section can be completely
#   eliminated with a command line option.  In addition, the number of hosts or
#   files reported can be limited using a command line option, for example to
#   generate a "most popular file" report.
#
#   The gopher log is structured as follows:
#
#       date/time [port] hostname : action
#
#   date/time is a standard unix date format: ddd mmm dd hh:mm:ss yyyy
#   port      is the client port number; it isn't present in the log files
#             written by all versions of the gopher server
#   hostname  is the fqdn or IP address of the calling host
#   action    is one of the following:
#                 retrieved file path
#                 retrieved directory path
#                 Root Connection
#
#   In addition, older versions of gopherd logged a start time in the
#   log file, which is filtered.
#
# Usage: gla [options] <logfile >reportfile
#
#   options:
#       -ha       skip host-alpha section of report
#       -hf       skip host-freq section of report
#       -fa       skip file-alpha section of report
#       -ff       skip file-freq section of report
#       -sum      skip summary section of report
#       -hlim n   limit host sections of report to n hosts
#       -flim n   limit file sections of report to n files
#
# End Description

# Append $HOME/bin to the library search path.
$home = $ENV{'HOME'};
push(@INC, "$home/bin");

# Month abbreviation => zero-based month number, filled in with a hash slice.
@mname{"Jan", "Feb", "Mar", "Apr", "May", "Jun",
       "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"} = (0 .. 11);

# Durations in seconds, all derived from the length of an hour.
$secs_hour = 60 * 60;
$secs_day  = 24 * $secs_hour;
$secs_norm = 365 * $secs_day;
$secs_leap = 366 * $secs_day;
($d28, $d29, $d30, $d31) =
    (28 * $secs_day, 29 * $secs_day, 30 * $secs_day, 31 * $secs_day);

# Seconds in each month, January first.  The leap-year table is the same
# list with February lengthened to 29 days.
@secs_month = ($d31, $d28, $d31, $d30, $d31, $d30,
               $d31, $d31, $d30, $d31, $d30, $d31);
@secs_leap_month = @secs_month;
$secs_leap_month[1] = $d29;

#
# Is it leap year?
#
# Takes a year number and returns 1 when it is a leap year under the
# divisible-by-4 / not-by-100 / unless-by-400 rule, otherwise 0.
#
sub isleap
{
    my($year) = @_;

    return 1 unless ($year % 400);      # divisible by 400: leap
    return 0 unless ($year % 100);      # any other century year: not leap
    return ($year % 4) ? 0 : 1;         # otherwise leap iff divisible by 4
}

#
# Compute seconds since Jan 1, 1970 without going south like timelocal()
#
# Argument: a timestamp laid out as "ddd mmm dd hh:mm:ss yyyy" (the first
# 24 characters of a log line).
# Returns:  the number of seconds between 00:00:00 on Jan 1, 1970 and the
#           timestamp, taking the fields at face value (no time zone math).
#           Returns 0 when the argument is undefined or shorter than 24
#           characters.
# Reads the file-level tables %mname, @secs_month and @secs_leap_month, the
# constants $secs_leap, $secs_norm, $secs_day and $secs_hour, and calls
# &isleap.  A month abbreviation missing from %mname contributes nothing,
# i.e. it is treated like January.
#
sub epoch
{
    my($tstr) = @_;
    my($since) = 0;
    my($i);

    # unpack() can die when the template runs past the end of the string,
    # so refuse anything too short to hold a full timestamp.
    return 0 unless defined($tstr) && length($tstr) >= 24;

    # The day-of-week field is not needed for the computation.
    my(undef, $mon, $mday, $hour, $min, $sec, $year) =
        unpack("a3 x a3 x a2 x a2 x a2 x a2 x a4", $tstr);

    # Whole years that have elapsed since 1970.
    for ($i = 1970; $i < $year; $i++)
    {
        $since += &isleap($i) ? $secs_leap : $secs_norm;
    }

    # Whole months elapsed in the final year; pick the table once.
    my(@month_secs) = &isleap($year) ? @secs_leap_month : @secs_month;
    for ($i = 0; $i < $mname{$mon}; $i++)
    {
        $since += $month_secs[$i];
    }

    # A space-padded day such as " 5" converts to 5 in numeric context,
    # so no separate stripping of the leading blank is required.
    $since += ($mday - 1) * $secs_day if ($mday > 1);
    $since += $hour * $secs_hour      if ($hour > 0);
    $since += $min * 60               if ($min > 0);
    $since += $sec;
    return $since;
}

#
# Comparison routines for sorting on the value of an entry in an
# associative array
#

#
# Sort comparator for host keys: larger call counts in %host come first.
# Keys with the same count are ordered by name so that the report does not
# depend on the order in which the hash happens to return its keys.
#
sub hostbyval # Sort on count of calls from this host
{
    ($host{$b} <=> $host{$a}) || ($a cmp $b);
}

#
# Sort comparator for file keys: larger use counts in %file come first.
# Keys with the same count are ordered by name so that the report does not
# depend on the order in which the hash happens to return its keys.
#
sub filebyval # Sort on count of calls for this file
{
    ($file{$b} <=> $file{$a}) || ($a cmp $b);
}

#
# Clip name, leaving domain
# Or keep first three octets of IP address
#
# Argument: a host name or dotted-quad IP address.
# Returns:  the lowercased argument with its most specific part removed:
#             "35.8.2.61"      -> "35.8.2."   (trailing dot retained)
#             "Gopher.MSU.Edu" -> "msu.edu"
#           A name containing no dot has nothing to clip and is returned
#           whole, in lowercase.
# The decision is made from the argument alone; $_ is not consulted.
#
sub chophost
{
    my($h) = @_;

    $h =~ tr/A-Z/a-z/;                      # Force lowercase

    # Is it an IP address?  Test the argument itself and return the capture
    # only when the match succeeded, so a stale $1 can never leak out.
    if ($h =~ /^([0-9]+\.[0-9]+\.[0-9]+\.)[0-9]+$/)
    {
        return $1;
    }

    # Otherwise drop the leading label and keep the remaining domain.
    if ($h =~ /[^\s.]+\.([^\s]+)/)
    {
        return $1;
    }

    return $h;
}

#
# Increment appropriate counters
#
# Works on the log line held in $_.  For a line carrying " hostname :" it
# bumps $host{<clipped host>}, $file{<item>} and $callcnt, remembers the
# first 24 characters of the line in $firstdate (first counted line only,
# guarded by $firstline) and in $lastdate, and prints a progress note to
# STDERR every 500 counted lines.  A line with no recognizable host field
# is ignored entirely.
#
sub tally
{
    my($th, $tf);

    # Extract host name from log line.  Without a match there is nothing
    # trustworthy in $1, so the line is skipped rather than miscounted.
    return unless / ([^\s]+) :/;
    $th = &chophost($1);            # Remove incriminating information

    # Work out which item the line refers to.  The item is decided afresh
    # for every line, so an unfamiliar action is never charged to whatever
    # the previous line retrieved.  "retrieved" is tested first so that a
    # path which merely contains the words "Root Connection" is still
    # counted under its own name.
    if (/retrieved ([^\s]+) (.*)$/)
    {
        $tf = $2;
    }
    elsif (/Root Connection/)
    {
        $tf = "Root Connection";
    }
    else
    {
        $tf = "(unrecognized action)";
    }

    $host{$th}++;
    $file{$tf}++;
    $callcnt++;
    if (! ($callcnt % 500))
    {
        print STDERR "$callcnt records processed.\r";
    }

    # The date stamp occupies the first 24 characters of the line.
    if ($firstline)
    {
        $firstline = 0;
        $firstdate = substr($_, 0, 24);
    }
    $lastdate = substr($_, 0, 24);
}

#
# Write one listing section of the report.
#
# Arguments:
#   $top     name of the top-of-form format
#   $detail  name of the detail-line format
#   $counts  reference to the hash of counts (%host or %file)
#   $name    reference to the global the detail format prints as the item
#            name ($host or $file)
#   $limit   maximum number of detail lines, 0 for no limit
#   @keys    the keys to list, already in the desired order
# Sets the globals $calls, $pct and $cum before every write; the alpha
# formats simply do not print $cum.
#
sub report_section
{
    my($top, $detail, $counts, $name, $limit, @keys) = @_;
    my($shown) = 0;
    my($running) = 0;

    $^ = $top;                          # Set top-of-form format
    $~ = $detail;                       # Set detail format
    foreach my $key (@keys)
    {
        $$name = $key;
        $calls = $counts->{$key};
        $running += $calls;
        $pct = sprintf("%5.1f", ($calls / $callcnt) * 100);
        $cum = sprintf("%5.1f", ($running / $callcnt) * 100);
        write;
        $shown++;
        last if ($limit && ($shown == $limit));
    }
    $- = 0;                             # Force end-of-page
}

#
# Print reports
#
# Writes every section whose flag ($haflag, $hfflag, $faflag, $ffflag,
# $sumflag) is true to the currently selected output handle, using the
# formats declared elsewhere in this file.  $hlim and $flim cap the number
# of lines in the host and file sections respectively (0 means no cap).
# Sets $hostcnt and $filecnt, which the summary format prints.
#
sub report
{
    # Number of distinct keys, assigned outright so the figures are 0 for
    # empty tables and do not grow if report is called more than once.
    $filecnt = keys(%file);
    $hostcnt = keys(%host);

    if ($haflag)
    {
        # Report domain names in alpha order
        &report_section("TOPHOSTALPHA", "HOSTALPHA", \%host, \$host, $hlim,
                        sort keys(%host));
    }

    if ($hfflag)
    {
        # Report domain names in descending order of call frequency
        &report_section("TOPHOSTFREQ", "HOSTFREQ", \%host, \$host, $hlim,
                        sort hostbyval keys(%host));
    }

    if ($faflag)
    {
        # Report file names in alpha order
        &report_section("TOPFILEALPHA", "FILEALPHA", \%file, \$file, $flim,
                        sort keys(%file));
    }

    if ($ffflag)
    {
        # Report file names in descending order of frequency of use
        &report_section("TOPFILEFREQ", "FILEFREQ", \%file, \$file, $flim,
                        sort filebyval keys(%file));
    }

    if ($sumflag)
    {
        # Report cheesy counters
        $^ = "TOPSUMMARY";
        $~ = "SUMMARY";
        write;
    }
}

###############################################################################

#
# Main program: set defaults, read the options, tally every log line that
# arrives on standard input, then print the report.
#
# Array subscripts use Perl's default base of 0; no assignment to $[ is made.
#
$= = 55;                # Max lines per page
$firstline = 1;         # Keep first date stamp in file
$haflag = 1;            # Want host-alpha report?
$hfflag = 1;            # Want host-freq report?
$faflag = 1;            # Want file-alpha report?
$ffflag = 1;            # Want file-freq report?
$sumflag = 1;           # Want summary report?
$hlim = 0;              # Unlimited number of hosts in report
$flim = 0;              # Unlimited number of files in report
$callcnt = 0;           # Nothing tallied yet

# Test with defined() so that an argument of "0" or "" is reported as
# unrecognized instead of silently ending option processing.
while (defined($arg = shift @ARGV))
{
    if ($arg eq "-ha")  { $haflag = 0;  next; }
    if ($arg eq "-hf")  { $hfflag = 0;  next; }
    if ($arg eq "-fa")  { $faflag = 0;  next; }
    if ($arg eq "-ff")  { $ffflag = 0;  next; }
    if ($arg eq "-sum") { $sumflag = 0; next; }
    if ($arg eq "-hlim" || $arg eq "-flim")
    {
        my $lim = shift @ARGV;
        if (!defined($lim) || $lim !~ /^[0-9]+$/)
        {
            print STDERR "$0: $arg requires a numeric count\n";
            exit(1);
        }
        if ($arg eq "-hlim") { $hlim = $lim; } else { $flim = $lim; }
        next;
    }
    # $0 is the program name; $ARGV[0] would be the *next* argument.
    print STDERR "$0: unrecognized argument: $arg\n";
    exit(1);
}

while (<>)                              # Gobble the whole log file
{
    next if /^$/;                       # Throw away blank lines
    next if /Starting gopher daemon/;   # Throw away start lines
    &tally();                           # Keep count
}
print STDERR "$callcnt records processed.\n";

# With nothing tallied there are no date stamps to convert and no calls to
# average over, so avoid both the conversion and the division by zero.
if ($callcnt)
{
    $startjul = &epoch($firstdate);
    $endjul = &epoch($lastdate);
    $period = $endjul - $startjul;
    $callspace = $period / $callcnt;
}
else
{
    $startjul = $endjul = $period = $callspace = 0;
}
&report();                              # Print reports
exit;

###############################################################################

# Page heading and detail line for the "client domains by name" section.
# The column titles are printed through the same picture as the detail line
# so that they always sit over their columns.  Detail values are listed with
# commas, which current versions of Perl require.
format TOPHOSTALPHA =

Michigan State University Gopher Log Analyzer Page: @<<
$%
Client Domains by Name

@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>> @>>>>
"Domain Domain or IP Address", "Calls", "%"
----------------------------------------------------- ------- -----
.

format HOSTALPHA =
@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>> @>>>>
$host, $calls, $pct
.

# Page heading and detail line for the "client domains by frequency"
# section.  The column titles are printed through the same picture as the
# detail line so that they always sit over their columns.  Detail values are
# listed with commas, which current versions of Perl require.
format TOPHOSTFREQ =

Michigan State University Gopher Log Analyzer Page: @<<
$%
Client Domains by Frequency of Use

@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>> @>>>> @>>>>
"Host Domain or IP Address", "Calls", "%", "Cum"
----------------------------------------------------- ------- ----- -----
.

format HOSTFREQ =
@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>> @>>>> @>>>>
$host, $calls, $pct, $cum
.

# Page heading and detail line for the "retrieved items by name" section.
# The column titles are printed through the same picture as the detail line
# so that they always sit over their columns.  Detail values are listed with
# commas, which current versions of Perl require.
format TOPFILEALPHA =

Michigan State University Gopher Log Analyzer Page: @<<
$%
Retrieved Items by Name

@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>> @>>>>
"File Description", "Uses", "%"
----------------------------------------------------- ------- -----
.

format FILEALPHA =
@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>> @>>>>
$file, $calls, $pct
.

# Page heading and detail line for the "retrieved items by frequency"
# section.  The column titles are printed through the same picture as the
# detail line so that they always sit over their columns.  Detail values are
# listed with commas, which current versions of Perl require.
format TOPFILEFREQ =

Michigan State University Gopher Log Analyzer Page: @<<
$%
Retrieved Items by Frequency of Use

@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>> @>>>> @>>>>
"File Description", "Uses", "%", "Cum"
----------------------------------------------------- ------- ----- -----
.

format FILEFREQ =
@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>> @>>>> @>>>>
$file, $calls, $pct, $cum
.

# Page heading and body of the summary section.  The body prints the first
# and last date stamps, the call total, the average spacing between calls
# and the number of distinct domains and files.  The average is rounded to
# two decimals before printing so the text field does not cut an unrounded
# float off in mid-number.
format TOPSUMMARY =

Michigan State University Gopher Log Analyzer Page: @<<
$%
Report Summary

.

format SUMMARY =
Starting date: @<<<<<<<<<<<<<<<<<<<<<<<<
$firstdate
Ending date: @<<<<<<<<<<<<<<<<<<<<<<<<
$lastdate

Total calls: @<<<<<<<<<
$callcnt
Average seconds between calls: @<<<<<<<<<
sprintf("%.2f", $callspace)

Different domains: @<<<<<<<<<
$hostcnt
Different files: @<<<<<<<<<
$filecnt
.
